// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012.
 * License:   $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(WEB digitalmars.com, Walter Bright),
 */

/* NOTE: This file has been patched from the original DMD distribution to
 * work with the GDC compiler.
 */
module core.simd;

pure:
nothrow:
@safe:
@nogc:

/*******************************
 * Create a vector type.
 *
 * `Vector!T` is a thin alias for the compiler's built-in `__vector(T)`.
 * `__vector` is compiler magic, so it is kept behind this template; the
 * compiler rejects any `T` that does not work.
 *
 * Params:
 *      T = for 128 bit vectors, one of double[2], float[4], void[16],
 *          byte[16], ubyte[16], short[8], ushort[8], int[4], uint[4],
 *          long[2], ulong[2].
 *          For 256 bit vectors, one of double[4], float[8], void[32],
 *          byte[32], ubyte[32], short[16], ushort[16], int[8], uint[8],
 *          long[4], ulong[4]
 */
alias Vector(T) = __vector(T);

/* Convenience names for the vector types.
 * Each alias is declared only when the compiler accepts the corresponding
 * Vector instantiation, so a name is simply absent where its type is not
 * supported.
 */

// 8 byte vectors
static if (is(Vector!(void[8])))    alias void8    = Vector!(void[8]);      ///
static if (is(Vector!(float[2])))   alias float2   = Vector!(float[2]);     ///
static if (is(Vector!(byte[8])))    alias byte8    = Vector!(byte[8]);      ///
static if (is(Vector!(ubyte[8])))   alias ubyte8   = Vector!(ubyte[8]);     ///
static if (is(Vector!(short[4])))   alias short4   = Vector!(short[4]);     ///
static if (is(Vector!(ushort[4])))  alias ushort4  = Vector!(ushort[4]);    ///
static if (is(Vector!(int[2])))     alias int2     = Vector!(int[2]);       ///
static if (is(Vector!(uint[2])))    alias uint2    = Vector!(uint[2]);      ///

// 16 byte vectors
static if (is(Vector!(void[16])))   alias void16   = Vector!(void[16]);     ///
static if (is(Vector!(double[2])))  alias double2  = Vector!(double[2]);    ///
static if (is(Vector!(float[4])))   alias float4   = Vector!(float[4]);     ///
static if (is(Vector!(byte[16])))   alias byte16   = Vector!(byte[16]);     ///
static if (is(Vector!(ubyte[16])))  alias ubyte16  = Vector!(ubyte[16]);    ///
static if (is(Vector!(short[8])))   alias short8   = Vector!(short[8]);     ///
static if (is(Vector!(ushort[8])))  alias ushort8  = Vector!(ushort[8]);    ///
static if (is(Vector!(int[4])))     alias int4     = Vector!(int[4]);       ///
static if (is(Vector!(uint[4])))    alias uint4    = Vector!(uint[4]);      ///
static if (is(Vector!(long[2])))    alias long2    = Vector!(long[2]);      ///
static if (is(Vector!(ulong[2])))   alias ulong2   = Vector!(ulong[2]);     ///

// 32 byte vectors
static if (is(Vector!(void[32])))   alias void32   = Vector!(void[32]);     ///
static if (is(Vector!(double[4])))  alias double4  = Vector!(double[4]);    ///
static if (is(Vector!(float[8])))   alias float8   = Vector!(float[8]);     ///
static if (is(Vector!(byte[32])))   alias byte32   = Vector!(byte[32]);     ///
static if (is(Vector!(ubyte[32])))  alias ubyte32  = Vector!(ubyte[32]);    ///
static if (is(Vector!(short[16])))  alias short16  = Vector!(short[16]);    ///
static if (is(Vector!(ushort[16]))) alias ushort16 = Vector!(ushort[16]);   ///
static if (is(Vector!(int[8])))     alias int8     = Vector!(int[8]);       ///
static if (is(Vector!(uint[8])))    alias uint8    = Vector!(uint[8]);      ///
static if (is(Vector!(long[4])))    alias long4    = Vector!(long[4]);      ///
static if (is(Vector!(ulong[4])))   alias ulong4   = Vector!(ulong[4]);     ///

version (D_SIMD)
{
  /** XMM opcodes that conform to the following:
   *
   *  opcode xmm1,xmm2/mem
   *
   * and do not have side effects (i.e. do not write to memory).
   *
   * Each value packs the instruction's encoding bytes into one integer,
   * most significant byte first; e.g. STOD = 0x660F7E corresponds to the
   * byte sequence "66 0F 7E /r" given in its comment.
   *
   * Some members share a value (LODHPS/MOVLHPS = 0x0F16,
   * LODLPS/MOVHLPS = 0x0F12, PEXTRD/PEXTRQ, PINSRD/PINSRQ), so a member
   * cannot be recovered from its numeric value alone.
   *
   * NOTE(review): the STO* members are store opcodes and are passed to
   * __simd_sto elsewhere in this file, which does not match the "do not
   * write to memory" statement above - confirm the intended wording.
   */
  enum XMM
  {
    // add
    ADDSS = 0xF30F58,
    ADDSD = 0xF20F58,
    ADDPS = 0x000F58,
    ADDPD = 0x660F58,
    PADDB = 0x660FFC,
    PADDW = 0x660FFD,
    PADDD = 0x660FFE,
    PADDQ = 0x660FD4,

    // subtract
    SUBSS = 0xF30F5C,
    SUBSD = 0xF20F5C,
    SUBPS = 0x000F5C,
    SUBPD = 0x660F5C,
    PSUBB = 0x660FF8,
    PSUBW = 0x660FF9,
    PSUBD = 0x660FFA,
    PSUBQ = 0x660FFB,

    // multiply
    MULSS = 0xF30F59,
    MULSD = 0xF20F59,
    MULPS = 0x000F59,
    MULPD = 0x660F59,
    PMULLW = 0x660FD5,

    // divide
    DIVSS = 0xF30F5E,
    DIVSD = 0xF20F5E,
    DIVPS = 0x000F5E,
    DIVPD = 0x660F5E,

    PAND  = 0x660FDB,
    POR   = 0x660FEB,

    UCOMISS = 0x000F2E,
    UCOMISD = 0x660F2E,

    XORPS = 0x000F57,
    XORPD = 0x660F57,

    // Use STO and LOD instead of MOV to distinguish the direction
    STOSS  = 0xF30F11,
    STOSD  = 0xF20F11,
    STOAPS = 0x000F29,
    STOAPD = 0x660F29,
    STODQA = 0x660F7F,
    STOD   = 0x660F7E,        // MOVD reg/mem64, xmm   66 0F 7E /r
    STOQ   = 0x660FD6,

    LODSS  = 0xF30F10,
    LODSD  = 0xF20F10,
    LODAPS = 0x000F28,
    LODAPD = 0x660F28,
    LODDQA = 0x660F6F,
    LODD   = 0x660F6E,        // MOVD xmm, reg/mem64   66 0F 6E /r
    LODQ   = 0xF30F7E,

    LODDQU   = 0xF30F6F,      // MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
    STODQU   = 0xF30F7F,      // MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
    MOVDQ2Q  = 0xF20FD6,      // MOVDQ2Q mmx, xmm          F2 0F D6 /r
    MOVHLPS  = 0x0F12,        // MOVHLPS xmm1, xmm2        0F 12 /r
    LODHPD   = 0x660F16,
    STOHPD   = 0x660F17,      // MOVHPD mem64, xmm         66 0F 17 /r
    LODHPS   = 0x0F16,
    STOHPS   = 0x0F17,
    MOVLHPS  = 0x0F16,
    LODLPD   = 0x660F12,
    STOLPD   = 0x660F13,
    LODLPS   = 0x0F12,
    STOLPS   = 0x0F13,
    MOVMSKPD = 0x660F50,
    MOVMSKPS = 0x0F50,
    MOVNTDQ  = 0x660FE7,
    MOVNTI   = 0x0FC3,
    MOVNTPD  = 0x660F2B,
    MOVNTPS  = 0x0F2B,
    MOVNTQ   = 0x0FE7,
    MOVQ2DQ  = 0xF30FD6,
    LODUPD   = 0x660F10,
    STOUPD   = 0x660F11,
    LODUPS   = 0x0F10,
    STOUPS   = 0x0F11,

    PACKSSDW = 0x660F6B,
    PACKSSWB = 0x660F63,
    PACKUSWB = 0x660F67,
    PADDSB = 0x660FEC,
    PADDSW = 0x660FED,
    PADDUSB = 0x660FDC,
    PADDUSW = 0x660FDD,
    PANDN = 0x660FDF,
    PCMPEQB = 0x660F74,
    PCMPEQD = 0x660F76,
    PCMPEQW = 0x660F75,
    PCMPGTB = 0x660F64,
    PCMPGTD = 0x660F66,
    PCMPGTW = 0x660F65,
    PMADDWD = 0x660FF5,
    PSLLW = 0x660FF1,
    PSLLD = 0x660FF2,
    PSLLQ = 0x660FF3,
    PSRAW = 0x660FE1,
    PSRAD = 0x660FE2,
    PSRLW = 0x660FD1,
    PSRLD = 0x660FD2,
    PSRLQ = 0x660FD3,
    PSUBSB = 0x660FE8,
    PSUBSW = 0x660FE9,
    PSUBUSB = 0x660FD8,
    PSUBUSW = 0x660FD9,
    PUNPCKHBW = 0x660F68,
    PUNPCKHDQ = 0x660F6A,
    PUNPCKHWD = 0x660F69,
    PUNPCKLBW = 0x660F60,
    PUNPCKLDQ = 0x660F62,
    PUNPCKLWD = 0x660F61,
    PXOR = 0x660FEF,
    ANDPD = 0x660F54,
    ANDPS = 0x0F54,
    ANDNPD = 0x660F55,
    ANDNPS = 0x0F55,
    CMPPS = 0x0FC2,
    CMPPD = 0x660FC2,
    CMPSD = 0xF20FC2,
    CMPSS = 0xF30FC2,
    COMISD = 0x660F2F,
    COMISS = 0x0F2F,
    CVTDQ2PD = 0xF30FE6,
    CVTDQ2PS = 0x0F5B,
    CVTPD2DQ = 0xF20FE6,
    CVTPD2PI = 0x660F2D,
    CVTPD2PS = 0x660F5A,
    CVTPI2PD = 0x660F2A,
    CVTPI2PS = 0x0F2A,
    CVTPS2DQ = 0x660F5B,
    CVTPS2PD = 0x0F5A,
    CVTPS2PI = 0x0F2D,
    CVTSD2SI = 0xF20F2D,
    CVTSD2SS = 0xF20F5A,
    CVTSI2SD = 0xF20F2A,
    CVTSI2SS = 0xF30F2A,
    CVTSS2SD = 0xF30F5A,
    CVTSS2SI = 0xF30F2D,
    CVTTPD2PI = 0x660F2C,
    CVTTPD2DQ = 0x660FE6,
    CVTTPS2DQ = 0xF30F5B,
    CVTTPS2PI = 0x0F2C,
    CVTTSD2SI = 0xF20F2C,
    CVTTSS2SI = 0xF30F2C,
    MASKMOVDQU = 0x660FF7,
    MASKMOVQ = 0x0FF7,
    MAXPD = 0x660F5F,
    MAXPS = 0x0F5F,
    MAXSD = 0xF20F5F,
    MAXSS = 0xF30F5F,
    MINPD = 0x660F5D,
    MINPS = 0x0F5D,
    MINSD = 0xF20F5D,
    MINSS = 0xF30F5D,
    ORPD = 0x660F56,
    ORPS = 0x0F56,
    PAVGB = 0x660FE0,
    PAVGW = 0x660FE3,
    PMAXSW = 0x660FEE,
    //PINSRW = 0x660FC4,
    PMAXUB = 0x660FDE,
    PMINSW = 0x660FEA,
    PMINUB = 0x660FDA,
    //PMOVMSKB = 0x660FD7,
    PMULHUW = 0x660FE4,
    PMULHW = 0x660FE5,
    PMULUDQ = 0x660FF4,
    PSADBW = 0x660FF6,
    PUNPCKHQDQ = 0x660F6D,
    PUNPCKLQDQ = 0x660F6C,
    RCPPS = 0x0F53,
    RCPSS = 0xF30F53,
    RSQRTPS = 0x0F52,
    RSQRTSS = 0xF30F52,
    SQRTPD = 0x660F51,
    SHUFPD = 0x660FC6,
    SHUFPS = 0x0FC6,
    SQRTPS = 0x0F51,
    SQRTSD = 0xF20F51,
    SQRTSS = 0xF30F51,
    UNPCKHPD = 0x660F15,
    UNPCKHPS = 0x0F15,
    UNPCKLPD = 0x660F14,
    UNPCKLPS = 0x0F14,

    PSHUFD = 0x660F70,
    PSHUFHW = 0xF30F70,
    PSHUFLW = 0xF20F70,
    PSHUFW = 0x0F70,
    // NOTE(review): these two share the bytes 66 0F 73 and differ only in
    // the extra leading byte (07 vs 03); presumably that byte carries the
    // /7 and /3 opcode extension - confirm against the compiler back end.
    PSLLDQ = 0x07660F73,
    PSRLDQ = 0x03660F73,

    //PREFETCH = 0x0F18,

// SSE3 Pentium 4 (Prescott)

    ADDSUBPD = 0x660FD0,
    ADDSUBPS = 0xF20FD0,
    HADDPD   = 0x660F7C,
    HADDPS   = 0xF20F7C,
    HSUBPD   = 0x660F7D,
    HSUBPS   = 0xF20F7D,
    MOVDDUP  = 0xF20F12,
    MOVSHDUP = 0xF30F16,
    MOVSLDUP = 0xF30F12,
    LDDQU    = 0xF20FF0,
    MONITOR  = 0x0F01C8,
    MWAIT    = 0x0F01C9,

// SSSE3
    PALIGNR = 0x660F3A0F,
    PHADDD = 0x660F3802,
    PHADDW = 0x660F3801,
    PHADDSW = 0x660F3803,
    PABSB = 0x660F381C,
    PABSD = 0x660F381E,
    PABSW = 0x660F381D,
    PSIGNB = 0x660F3808,
    PSIGND = 0x660F380A,
    PSIGNW = 0x660F3809,
    PSHUFB = 0x660F3800,
    PMADDUBSW = 0x660F3804,
    PMULHRSW = 0x660F380B,
    PHSUBD = 0x660F3806,
    PHSUBW = 0x660F3805,
    PHSUBSW = 0x660F3807,

// SSE4.1

    BLENDPD   = 0x660F3A0D,
    BLENDPS   = 0x660F3A0C,
    BLENDVPD  = 0x660F3815,
    BLENDVPS  = 0x660F3814,
    DPPD      = 0x660F3A41,
    DPPS      = 0x660F3A40,
    EXTRACTPS = 0x660F3A17,
    INSERTPS  = 0x660F3A21,
    MPSADBW   = 0x660F3A42,
    PBLENDVB  = 0x660F3810,
    PBLENDW   = 0x660F3A0E,
    PEXTRD    = 0x660F3A16,
    PEXTRQ    = 0x660F3A16,
    PINSRB    = 0x660F3A20,
    PINSRD    = 0x660F3A22,
    PINSRQ    = 0x660F3A22,

    MOVNTDQA = 0x660F382A,
    PACKUSDW = 0x660F382B,
    PCMPEQQ = 0x660F3829,
    PEXTRB = 0x660F3A14,
    PHMINPOSUW = 0x660F3841,
    PMAXSB = 0x660F383C,
    PMAXSD = 0x660F383D,
    PMAXUD = 0x660F383F,
    PMAXUW = 0x660F383E,
    PMINSB = 0x660F3838,
    PMINSD = 0x660F3839,
    PMINUD = 0x660F383B,
    PMINUW = 0x660F383A,
    PMOVSXBW = 0x660F3820,
    PMOVSXBD = 0x660F3821,
    PMOVSXBQ = 0x660F3822,
    PMOVSXWD = 0x660F3823,
    PMOVSXWQ = 0x660F3824,
    PMOVSXDQ = 0x660F3825,
    PMOVZXBW = 0x660F3830,
    PMOVZXBD = 0x660F3831,
    PMOVZXBQ = 0x660F3832,
    PMOVZXWD = 0x660F3833,
    PMOVZXWQ = 0x660F3834,
    PMOVZXDQ = 0x660F3835,
    PMULDQ   = 0x660F3828,
    PMULLD   = 0x660F3840,
    PTEST    = 0x660F3817,

    ROUNDPD = 0x660F3A09,
    ROUNDPS = 0x660F3A08,
    ROUNDSD = 0x660F3A0B,
    ROUNDSS = 0x660F3A0A,

// SSE4.2
    PCMPESTRI  = 0x660F3A61,
    PCMPESTRM  = 0x660F3A60,
    PCMPISTRI  = 0x660F3A63,
    PCMPISTRM  = 0x660F3A62,
    PCMPGTQ    = 0x660F3837,
    //CRC32

// SSE4a (AMD only)
    // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

// POPCNT and LZCNT (have their own CPUID bits)
    POPCNT     = 0xF30FB8,
    // LZCNT
  }

  /**
   * Generate two operand instruction with XMM 128 bit operands.
   *
   * This is a compiler magic function - it doesn't behave like
   * regular D functions. It is declared here without a body.
   *
   * Params:
   *      opcode = any of the XMM opcodes; it must be a compile time constant
   *      op1    = first operand
   *      op2    = second operand
   * Returns:
   *      result of opcode
   */
  pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);

  /**
   * Unary SIMD instructions.
   *
   * Declared without a body, like the two operand form. The overloads take
   * the single operand as a 128 bit vector, a double, or a float.
   */
  pure @safe void16 __simd(XMM opcode, void16 op1);
  pure @safe void16 __simd(XMM opcode, double d);       ///
  pure @safe void16 __simd(XMM opcode, float f);        ///

  /****
   * For instructions:
   * CMPPD, CMPSS, CMPSD, CMPPS,
   * PSHUFD, PSHUFHW, PSHUFLW,
   * BLENDPD, BLENDPS, DPPD, DPPS,
   * MPSADBW, PBLENDW,
   * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
   *
   * Declared without a body, like the other __simd overloads.
   *
   * Params:
   *      opcode = any of the above XMM opcodes; it must be a compile time constant
   *      op1    = first operand
   *      op2    = second operand
   *      imm8   = third operand; must be a compile time constant
   * Returns:
   *      result of opcode
   */
  pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);

  /***
   * For instructions with the imm8 version:
   * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
   * PSRLDQ, PSLLDQ
   *
   * Declared without a body.
   *
   * Params:
   *      opcode = any of the XMM opcodes; it must be a compile time constant
   *      op1    = first operand
   *      imm8   = second operand; must be a compile time constant
   * Returns:
   *      result of opcode
   */
  pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);

  /*****
   * For "store" operations of the form:
   *    op1 op= op2
   *
   * Declared without a body. The overloads take the destination as a
   * 128 bit vector, a double, or a float.
   *
   * These cannot be marked as pure, as semantic() doesn't check them.
   *
   * NOTE(review): the `pure:` label at the top of this module applies to
   * every declaration that follows it, so omitting the `pure` keyword here
   * does not by itself make these impure - confirm what is intended.
   *
   * Params:
   *      opcode = XMM store opcode (this module passes STOUPD, STOUPS and STODQU)
   *      op1    = destination of the store
   *      op2    = value to store
   * Returns:
   *    op2
   */
  @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
  @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
  @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);  ///

  /* The following use overloading to ensure correct typing.
   * Compile with inlining on for best performance.
   *
   * __simd returns void16, so the result is cast explicitly to the
   * overload's own vector type, as loadUnaligned and storeUnaligned do,
   * rather than relying on an implicit conversion from a void vector.
   */

  /// Emit PCMPEQW on v1 and v2 and return the result typed as short8.
  pure @safe short8 pcmpeq()(short8 v1, short8 v2)
  {
      return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
  }

  /// Emit PCMPEQW on v1 and v2 and return the result typed as ushort8.
  pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
  {
      return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
  }

  /*********************
   * Emit prefetch instruction.
   *
   * When writeFetch is true the same encoding is used for every locality;
   * locality is neither used nor range-checked in that case. When
   * writeFetch is false, locality must be 0..3 or compilation fails.
   *
   * Params:
   *    address = address to be prefetched
   *    writeFetch = true for write fetch, false for read fetch
   *    locality = 0..3 (0 meaning least local, 3 meaning most local)
   * Note:
   *    The Intel mappings are:
   *    $(TABLE
   *    $(THEAD writeFetch, locality, Instruction)
   *    $(TROW false, 0, prefetchnta)
   *    $(TROW false, 1, prefetcht2)
   *    $(TROW false, 2, prefetcht1)
   *    $(TROW false, 3, prefetcht0)
   *    $(TROW true, 0, prefetchw)
   *    $(TROW true, 1, prefetchw)
   *    $(TROW true, 2, prefetchw)
   *    $(TROW true, 3, prefetchw)
   *    )
   */
  void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
  {
        static if (writeFetch)
            // Write fetch: one encoding regardless of locality.
            __prefetch(address, 4);
        else static if (locality < 4)
            // Read fetch: locality 0..3 is passed on as encoding 3..0.
            __prefetch(address, 3 - locality);
        else
            static assert(0, "0..3 expected for locality");
  }

  /* Back end of prefetch(). Declared without a body here; presumably the
   * compiler supplies it as an intrinsic - confirm.
   * prefetch() passes encoding 4 for a write fetch and 3 - locality
   * (i.e. 3..0) for a read fetch.
   */
  private void __prefetch(const(void*) address, ubyte encoding);

  /*************************************
   * Load unaligned vector from address.
   * This is a compiler intrinsic.
   *
   * The opcode is selected from V at compile time: LODUPD for double2,
   * LODUPS for float4, and LODDQU for every other accepted vector type.
   *
   * Params:
   *    p = pointer to vector
   * Returns:
   *    vector
   */

  V loadUnaligned(V)(const V* p)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            // The floating point vectors have dedicated branches in the body,
            // so they must be admitted by the constraint as well.
            is(V == double2) ||
            is(V == float4))
  {
        pragma(inline, true);
        // __simd works on void16, so the pointee is reinterpreted as void16
        // and the result is cast back to V.
        static if (is(V == double2))
            return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
        else static if (is(V == float4))
            return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
        else
            return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
  }

  /*************************************
   * Store vector to unaligned address.
   * This is a compiler intrinsic.
   *
   * The opcode is selected from V at compile time: STOUPD for double2,
   * STOUPS for float4, and STODQU for every other accepted vector type.
   *
   * Params:
   *    p = pointer to vector
   *    value = value to store
   * Returns:
   *    value
   */

  V storeUnaligned(V)(V* p, V value)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            // The floating point vectors have dedicated branches in the body,
            // so they must be admitted by the constraint as well.
            is(V == double2) ||
            is(V == float4))
  {
        pragma(inline, true);
        // __simd_sto works on void16, so the destination is reinterpreted as
        // void16 and the result is cast back to V.
        static if (is(V == double2))
            return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
        else static if (is(V == float4))
            return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
        else
            return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
  }
}
